# BASE
# ------------------------------------------------------
import numpy as np
import pandas as pd
import os
import gc
import warnings
# PACF - ACF
# ------------------------------------------------------
import statsmodels.api as sm
# DATA VISUALIZATION
# ------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# CONFIGURATIONS
# ------------------------------------------------------
# Display every column, render floats with 2 decimal places, and silence
# warnings (e.g. pandas SettingWithCopyWarning) for cleaner notebook output.
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.2f}'.format
warnings.filterwarnings('ignore')
# Load the core competition datasets.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
stores = pd.read_csv("stores.csv")
transactions = pd.read_csv("transactions.csv").sort_values(["store_nbr", "date"])

# Parse the date columns in place (one loop instead of three assignments).
for frame in (train, test, transactions):
    frame["date"] = pd.to_datetime(frame["date"])

# Downcast numeric columns to reduce memory footprint.
train["onpromotion"] = train["onpromotion"].astype("float16")
train["sales"] = train["sales"].astype("float32")
stores["cluster"] = stores["cluster"].astype("int8")

train.head()
| id | date | store_nbr | family | sales | onpromotion | |
|---|---|---|---|---|---|---|
| 0 | 0 | 2013-01-01 | 1 | AUTOMOTIVE | 0.00 | 0.00 |
| 1 | 1 | 2013-01-01 | 1 | BABY CARE | 0.00 | 0.00 |
| 2 | 2 | 2013-01-01 | 1 | BEAUTY | 0.00 | 0.00 |
| 3 | 3 | 2013-01-01 | 1 | BEVERAGES | 0.00 | 0.00 |
| 4 | 4 | 2013-01-01 | 1 | BOOKS | 0.00 | 0.00 |
transactions.head(10)
| date | store_nbr | transactions | |
|---|---|---|---|
| 1 | 2013-01-02 | 1 | 2111 |
| 47 | 2013-01-03 | 1 | 1833 |
| 93 | 2013-01-04 | 1 | 1863 |
| 139 | 2013-01-05 | 1 | 1509 |
| 185 | 2013-01-06 | 1 | 520 |
| 231 | 2013-01-07 | 1 | 1807 |
| 277 | 2013-01-08 | 1 | 1869 |
| 323 | 2013-01-09 | 1 | 1910 |
| 369 | 2013-01-10 | 1 | 1679 |
| 415 | 2013-01-11 | 1 | 1813 |
# Total daily sales per store, joined with that store's transaction count.
daily_store_sales = train.groupby(["date", "store_nbr"]).sales.sum().reset_index()
temp = pd.merge(daily_store_sales, transactions, how = "left")
spearman_corr = temp.corr("spearman").sales.loc["transactions"]
print("Spearman Correlation between Total Sales and Transactions: {:,.4f}".format(spearman_corr))
px.line(transactions.sort_values(["store_nbr", "date"]), x='date', y='transactions', color='store_nbr',title = "Transactions" )
Spearman Correlation between Total Sales and Transactions: 0.8175
There is a stable pattern in Transactions. The boxplot shows that all months are similar from 2013 to 2017, except December. In addition, we've just seen the same pattern for each store in the previous plot. Store sales have always increased at the end of the year.
# Year/month breakdown of transactions for a monthly boxplot per year.
a = transactions.assign(
    year=transactions.date.dt.year,
    month=transactions.date.dt.month,
)
px.box(a, x="year", y="transactions" , color = "month", title = "Transactions")
Let's take a look at transactions by using monthly average sales! We've just learned a pattern that increases sales: the end of the year. We can also see that transactions increase in spring and decrease after spring.
# Monthly mean transactions; one colored line per year.
monthly_mean = transactions.set_index("date").resample("M").transactions.mean()
a = monthly_mean.reset_index()
a["year"] = a.date.dt.year
px.line(a, x='date', y='transactions', color='year',title = "Monthly Average Transactions" )
There is also a high correlation between total sales and transactions.
px.scatter(temp, x = "transactions", y = "sales", trendline = "ols", trendline_color_override = "red")
Let's take a look at the days of the week for shopping. It shows that stores make more transactions at weekends. The patterns are almost the same from 2013 to 2017, and Saturday is the most important day for shopping.
# Mean transactions by day of week (1=Monday .. 7=Sunday), one line per year.
a = (
    transactions
    .assign(year=transactions.date.dt.year,
            dayofweek=transactions.date.dt.dayofweek + 1)
    .groupby(["year", "dayofweek"], as_index=False)
    .transactions.mean()
)
px.line(a, x="dayofweek", y="transactions" , color = "year", title = "Transactions")
There are some missing data points in the daily oil data as you can see below. You can treat the data by using various imputation methods. However, I chose a simple solution for that. Linear interpolation is suitable for this time series. You can see the trend and predict missing data points when you look at a time series plot of the oil price.
# Import daily oil (WTI) prices and parse the date column.
oil = pd.read_csv("oil.csv")
oil["date"] = pd.to_datetime(oil["date"])
oil.head()
| date | dcoilwtico | |
|---|---|---|
| 0 | 2013-01-01 | NaN |
| 1 | 2013-01-02 | 93.14 |
| 2 | 2013-01-03 | 92.97 |
| 3 | 2013-01-04 | 93.12 |
| 4 | 2013-01-07 | 93.20 |
# Resample to a full daily calendar; days missing from the CSV get sum() == 0.
oil = oil.set_index("date").dcoilwtico.resample("D").sum().reset_index()
# Interpolate: turn the 0 placeholders back into NaN, then fill linearly.
oil["dcoilwtico"] = np.where(oil["dcoilwtico"] == 0, np.nan, oil["dcoilwtico"])
oil["dcoilwtico_interpolated"] = oil.dcoilwtico.interpolate()
# Plot raw vs. interpolated series in long format.
# FIX: the original id_vars was ['date'] + list(oil.keys()[5:]) — oil has only
# three columns, so the [5:] slice was always empty dead code; removed.
p = oil.melt(id_vars=['date'], var_name='Legend')
px.line(p.sort_values(["Legend", "date"], ascending = [False, True]), x='date', y='value', color='Legend',title = "Daily Oil Price" )
Let's take a look at the correlation of Daily Oil Prices with sales and transactions. The correlation values are not strong, but the sign for sales is negative. We can imagine that if the daily oil price is high, Ecuador's economy is likely doing poorly, which means product prices increase and sales decrease. There is a negative relationship here.
# Join interpolated oil prices onto the sales/transactions frame and inspect
# Spearman correlations plus two scatter plots.
temp = temp.merge(oil, how = "left")
print("Correlation with Daily Oil Prices")
spearman = temp.drop(["store_nbr", "dcoilwtico"], axis = 1).corr("spearman")
print(spearman.dcoilwtico_interpolated.loc[["sales", "transactions"]], "\n")

fig, axes = plt.subplots(1, 2, figsize = (15,5))
temp.plot.scatter(x = "dcoilwtico_interpolated", y = "transactions", ax=axes[0])
temp.plot.scatter(x = "dcoilwtico_interpolated", y = "sales", ax=axes[1], color = "r")
axes[0].set_title('Daily oil price & Transactions', fontsize = 15)
axes[1].set_title('Daily Oil Price & Sales', fontsize = 15);
Correlation with Daily Oil Prices sales -0.30 transactions 0.04 Name: dcoilwtico_interpolated, dtype: float64
Most of the stores are similar to each other when we examine them with a correlation matrix. Some stores, such as 20, 21, 22, and 52, may be a little different.
# Pairwise correlation of per-store sales series, shown as a lower-triangle heatmap.
a = train[["store_nbr", "sales"]].copy()  # .copy() avoids chained-assignment issues
a["ind"] = 1
a["ind"] = a.groupby("store_nbr").ind.cumsum().values  # per-store row counter to align series
a = pd.pivot(a, index = "ind", columns = "store_nbr", values = "sales").corr()
# BUG FIX: the original built the mask with np.triu(a.corr()) — but `a` is
# already a correlation matrix, so that computed a meaningless "correlation of
# correlations" and only worked because its entries happened to be nonzero.
# Use an explicit boolean upper-triangle mask instead.
mask = np.triu(np.ones_like(a, dtype=bool))
plt.figure(figsize=(20, 20))
sns.heatmap(a,
        annot=True,
        fmt='.1f',
        cmap='coolwarm',
        square=True,
        mask=mask,
        linewidths=1,
        cbar=False)
plt.title("Correlations among stores",fontsize = 20)
plt.show()
# Daily total sales per store, one line per store.
a = (
    train.set_index("date")
    .groupby("store_nbr")
    .resample("D").sales.sum()
    .reset_index()
)
px.line(a, x = "date", y= "sales", color = "store_nbr", title = "Daily total sales of the stores")
I noticed some unnecessary rows in the data while I was looking at the time series of the stores one by one. If you select the stores from above, some of them have no sales at the beginning of 2013. You can see this if you look at stores 20, 21, 22, 29, 36, 42, 52 and 53. I decided to remove those rows from before the stores opened. In the following code, we will get rid of them.
print(train.shape)
# Drop rows recorded before each store actually opened (store -> opening date).
opening_dates = {
    52: "2017-04-20",
    22: "2015-10-09",
    42: "2015-08-21",
    21: "2015-07-24",
    29: "2015-03-20",
    20: "2015-02-13",
    53: "2014-05-29",
    36: "2013-05-09",
}
for store, opened in opening_dates.items():
    train = train[~((train.store_nbr == store) & (train.date < opened))]
train.shape
(3000888, 6)
(2780316, 6)
Some stores don't sell some product families. In the following code, you can see which products aren't sold in which stores. It isn't difficult to forecast them for the next 15 days: their forecasts must be 0 for the next 15 days.
I will remove them from the data and create a new data frame for the product families that never sell. Then, at the submission step, I will combine that data frame with our predictions.
# Total sales per (store, family); keep only the pairs that never sold anything.
totals = train.groupby(["store_nbr", "family"]).sales.sum().reset_index()
totals = totals.sort_values(["family", "store_nbr"])
c = totals[totals.sales == 0]
c
| store_nbr | family | sales | |
|---|---|---|---|
| 1 | 1 | BABY CARE | 0.00 |
| 397 | 13 | BABY CARE | 0.00 |
| 727 | 23 | BABY CARE | 0.00 |
| 1420 | 44 | BABY CARE | 0.00 |
| 1453 | 45 | BABY CARE | 0.00 |
| 1486 | 46 | BABY CARE | 0.00 |
| 1519 | 47 | BABY CARE | 0.00 |
| 1552 | 48 | BABY CARE | 0.00 |
| 1585 | 49 | BABY CARE | 0.00 |
| 1618 | 50 | BABY CARE | 0.00 |
| 1651 | 51 | BABY CARE | 0.00 |
| 1684 | 52 | BABY CARE | 0.00 |
| 268 | 9 | BOOKS | 0.00 |
| 301 | 10 | BOOKS | 0.00 |
| 334 | 11 | BOOKS | 0.00 |
| 367 | 12 | BOOKS | 0.00 |
| 400 | 13 | BOOKS | 0.00 |
| 433 | 14 | BOOKS | 0.00 |
| 466 | 15 | BOOKS | 0.00 |
| 499 | 16 | BOOKS | 0.00 |
| 532 | 17 | BOOKS | 0.00 |
| 565 | 18 | BOOKS | 0.00 |
| 598 | 19 | BOOKS | 0.00 |
| 631 | 20 | BOOKS | 0.00 |
| 664 | 21 | BOOKS | 0.00 |
| 697 | 22 | BOOKS | 0.00 |
| 895 | 28 | BOOKS | 0.00 |
| 928 | 29 | BOOKS | 0.00 |
| 961 | 30 | BOOKS | 0.00 |
| 994 | 31 | BOOKS | 0.00 |
| 1027 | 32 | BOOKS | 0.00 |
| 1060 | 33 | BOOKS | 0.00 |
| 1093 | 34 | BOOKS | 0.00 |
| 1126 | 35 | BOOKS | 0.00 |
| 1159 | 36 | BOOKS | 0.00 |
| 1258 | 39 | BOOKS | 0.00 |
| 1291 | 40 | BOOKS | 0.00 |
| 1390 | 43 | BOOKS | 0.00 |
| 1687 | 52 | BOOKS | 0.00 |
| 1753 | 54 | BOOKS | 0.00 |
| 514 | 16 | LADIESWEAR | 0.00 |
| 811 | 25 | LADIESWEAR | 0.00 |
| 910 | 28 | LADIESWEAR | 0.00 |
| 943 | 29 | LADIESWEAR | 0.00 |
| 1042 | 32 | LADIESWEAR | 0.00 |
| 1075 | 33 | LADIESWEAR | 0.00 |
| 1141 | 35 | LADIESWEAR | 0.00 |
| 1306 | 40 | LADIESWEAR | 0.00 |
| 1405 | 43 | LADIESWEAR | 0.00 |
| 1768 | 54 | LADIESWEAR | 0.00 |
| 449 | 14 | LAWN AND GARDEN | 0.00 |
| 977 | 30 | LAWN AND GARDEN | 0.00 |
| 1769 | 54 | LAWN AND GARDEN | 0.00 |
print(train.shape)
# Anti join: remove from train every (store_nbr, family) pair that never sells.
# FIX: `c` already contains only zero-sales rows (filtered above), so the
# original's extra c[c.sales == 0] was redundant.  A left merge keeping
# 'left_only' rows is the standard anti-join and is equivalent here because
# every pair in `c` was derived from train (the outer merge never produced
# right-only rows).  Downstream operations are order-insensitive, so the
# left-join row order is fine.
merged = train.merge(c.drop("sales", axis = 1), how = 'left', indicator = True)
train = merged[merged._merge == 'left_only'].drop('_merge', axis = 1)
del merged
gc.collect()
train.shape
(2780316, 6)
(2698648, 6)
# Hard-zero forecasts for the never-selling (store, family) pairs over the
# 16-day submission horizon; combined with model predictions at submission time.
zero_prediction = pd.concat(
    pd.DataFrame({
        "date": pd.date_range("2017-08-16", "2017-08-31").tolist(),
        "store_nbr": row.store_nbr,
        "family": row.family,
        "sales": 0,
    })
    for row in c.itertuples(index=False)
)
del c
gc.collect()
zero_prediction
| date | store_nbr | family | sales | |
|---|---|---|---|---|
| 0 | 2017-08-16 | 1 | BABY CARE | 0 |
| 1 | 2017-08-17 | 1 | BABY CARE | 0 |
| 2 | 2017-08-18 | 1 | BABY CARE | 0 |
| 3 | 2017-08-19 | 1 | BABY CARE | 0 |
| 4 | 2017-08-20 | 1 | BABY CARE | 0 |
| ... | ... | ... | ... | ... |
| 11 | 2017-08-27 | 54 | LAWN AND GARDEN | 0 |
| 12 | 2017-08-28 | 54 | LAWN AND GARDEN | 0 |
| 13 | 2017-08-29 | 54 | LAWN AND GARDEN | 0 |
| 14 | 2017-08-30 | 54 | LAWN AND GARDEN | 0 |
| 15 | 2017-08-31 | 54 | LAWN AND GARDEN | 0 |
848 rows × 4 columns
# Daily total sales per product family, one line per family.
a = (
    train.set_index("date")
    .groupby("family")
    .resample("D").sales.sum()
    .reset_index()
)
px.line(a, x = "date", y= "sales", color = "family", title = "Daily total sales of the family")
We are working with the stores. Well, there are plenty of products in the stores, and we need to know which product families sell the most. Let's make a bar plot to see that. The graph shows us that GROCERY I and BEVERAGES are the top selling families.
# Mean sales per family, highest first, as a horizontal bar chart.
a = (
    train.groupby("family").sales.mean()
    .sort_values(ascending = False)
    .reset_index()
)
px.bar(a, y = "family", x="sales", color = "family", title = "Which product family preferred more?")
How different can stores be from each other? The plot shows the pattern of stores from different cities.
# Join store metadata onto train and plot mean yearly sales per city.
d = pd.merge(train, stores)
d["store_nbr"] = d["store_nbr"].astype("int8")
d["year"] = d.date.dt.year
city_year_mean = d.groupby(["city", "year"]).sales.mean().reset_index()
px.line(city_year_mean, x = "year", y = "sales", color = "city")
As we have so many rows in our dataset, it will be easier to group the data, for example, by week or month. The aggregation will be made by mean.
def grouped(df, key, freq, col):
    """Aggregate `col` by mean over time column `key` at frequency `freq`.

    Returns a DataFrame with two columns: the resampled `key` and 'mean'.
    """
    grouper = pd.Grouper(key=key, freq=freq)
    out = df.groupby([grouper]).agg(mean=(col, 'mean'))
    return out.reset_index()
# Weekly mean of transactions via the grouped() helper; displayed for a sanity check.
df_grouped_trans_w = grouped(transactions, 'date', 'W', 'transactions')
df_grouped_trans_w
| date | mean | |
|---|---|---|
| 0 | 2013-01-06 | 1883.20 |
| 1 | 2013-01-13 | 1641.09 |
| 2 | 2013-01-20 | 1639.02 |
| 3 | 2013-01-27 | 1609.82 |
| 4 | 2013-02-03 | 1685.26 |
| ... | ... | ... |
| 237 | 2017-07-23 | 1623.21 |
| 238 | 2017-07-30 | 1619.65 |
| 239 | 2017-08-06 | 1713.74 |
| 240 | 2017-08-13 | 1599.16 |
| 241 | 2017-08-20 | 1592.68 |
242 rows × 2 columns
For better forecasting, we'll add a 'time' column to our dataframe.
def add_time(df, key, freq, col):
    """Group `df` via grouped() and insert a 0-based 'time' step column.

    The resulting column order is [key, 'time', 'mean']; 'time' is the
    integer index of each period, usable as a linear-regression feature.
    """
    out = grouped(df, key, freq, col)
    out.insert(1, 'time', np.arange(len(out)))
    return out
# Weekly and monthly mean sales, each with a sequential 'time' feature column.
df_grouped_train_w = add_time(train, 'date', 'W', 'sales')
df_grouped_train_m = add_time(train, 'date', 'M', 'sales')
df_grouped_train_w.head() # check results
| date | time | mean | |
|---|---|---|---|
| 0 | 2013-01-06 | 0 | 250.23 |
| 1 | 2013-01-13 | 1 | 230.20 |
| 2 | 2013-01-20 | 2 | 229.66 |
| 3 | 2013-01-27 | 3 | 220.36 |
| 4 | 2013-02-03 | 4 | 240.22 |
After that, we can build some more plots. Linear regression is widely used in practice and adapts naturally to even complex forecasting tasks. The linear regression algorithm learns how to make a weighted sum from its input features.
# Three stacked panels: weekly transactions, weekly sales, monthly sales.
# The sales panels get a seaborn OLS regression line overlaid on the raw series.
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(30,20))
# TRANSACTIONS (WEEKLY)
axes[0].plot('date', 'mean', data=df_grouped_trans_w, color='grey', marker='o')
axes[0].set_title("Transactions (grouped by week)", fontsize=20)
# SALES (WEEKLY)
axes[1].plot('time', 'mean', data=df_grouped_train_w, color='0.75')
axes[1].set_title("Sales (grouped by week)", fontsize=20)
# linear regression (regplot draws scatter + fitted line on the same axes)
axes[1] = sns.regplot(x='time',
                y='mean',
                data=df_grouped_train_w,
                scatter_kws=dict(color='0.75'),
                ax = axes[1])
# SALES (MONTHLY)
axes[2].plot('time', 'mean', data=df_grouped_train_m, color='0.75')
axes[2].set_title("Sales (grouped by month)", fontsize=20)
# linear regression — red line to stand out against the grey series
axes[2] = sns.regplot(x='time',
                y='mean',
                data=df_grouped_train_m,
                scatter_kws=dict(color='0.75'),
                line_kws={"color": "red"},
                ax = axes[2])
plt.show()
To make a lag feature, we shift the observations of the target series so that they appear to have occurred later in time. Here we've created a 1-step lag feature, though shifting by multiple steps is possible too. So, first, we should add the lag to our data:
def add_lag(df, key, freq, col, lag):
    """Group `df` via grouped() and add a 'Lag' column: 'mean' shifted by `lag`.

    The first `lag` rows of 'Lag' are NaN.  The column is intentionally named
    'Lag' regardless of `lag`, because downstream plotting code references that
    exact name.  FIX: removed the original's unused `name = 'Lag_' + str(lag)`
    dead variable, which suggested a per-lag column name that was never applied.
    """
    df_grouped = grouped(df, key, freq, col)
    df_grouped['Lag'] = df_grouped['mean'].shift(lag)
    return df_grouped
# Weekly and monthly sales with a 1-step lag feature.
df_grouped_train_w_lag1 = add_lag(train, 'date', 'W', 'sales', 1)
# BUG FIX: the monthly frame was built with freq 'W' (copy-paste error) even
# though its name and the "grouped by month" plot below expect monthly data.
df_grouped_train_m_lag1 = add_lag(train, 'date', 'M', 'sales', 1)
df_grouped_train_w_lag1.head() # check data
| date | mean | Lag | |
|---|---|---|---|
| 0 | 2013-01-06 | 250.23 | NaN |
| 1 | 2013-01-13 | 230.20 | 250.23 |
| 2 | 2013-01-20 | 229.66 | 230.20 |
| 3 | 2013-01-27 | 220.36 | 229.66 |
| 4 | 2013-02-03 | 240.22 | 220.36 |
Let's build same plots, but with 'lag' feature:
# Lag plots: 'mean' against its 1-step lag, weekly on top, monthly below,
# each with a seaborn OLS regression line over the dotted raw series.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(30,20))
axes[0].plot('Lag', 'mean', data=df_grouped_train_w_lag1, color='0.75', linestyle=(0, (1, 10)))
axes[0].set_title("Sales (grouped by week)", fontsize=20)
axes[0] = sns.regplot(x='Lag',
                y='mean',
                data=df_grouped_train_w_lag1,
                scatter_kws=dict(color='0.75'),
                ax = axes[0])
axes[1].plot('Lag', 'mean', data=df_grouped_train_m_lag1, color='0.75', linestyle=(0, (1, 10)))
axes[1].set_title("Sales (grouped by month)", fontsize=20)
# red regression line to distinguish it from the grey scatter
axes[1] = sns.regplot(x='Lag',
                y='mean',
                data=df_grouped_train_m_lag1,
                scatter_kws=dict(color='0.75'),
                line_kws={"color": "red"},
                ax = axes[1])
plt.show()
The trend component of a time series represents a persistent, long-term change in the mean of the series. The trend is the slowest-moving part of a series, the part representing the largest time scale of importance. In a time series of product sales, an increasing trend might be the effect of a market expansion as more people become aware of the product year by year.
To see what kind of trend a time series might have, we can use a moving average plot. To compute a moving average of a time series, we compute the average of the values within a sliding window of some defined width. Each point on the graph represents the average of all the values in the series that fall within the window on either side. The idea is to smooth out any short-term fluctuations in the series so that only long-term changes remain.
def plot_moving_average(df, key, freq, col, window, min_periods, ax, title):
    """Plot the grouped mean series and its centered moving average on `ax`.

    The raw series is drawn dash-dotted in grey; the rolling mean (width
    `window`, requiring at least `min_periods` observations) in green.
    """
    series = grouped(df, key, freq, col)['mean']
    smoothed = series.rolling(window=window, center=True, min_periods=min_periods).mean()
    series.plot(color='0.75', linestyle='dashdot', ax=ax)
    smoothed.plot(linewidth=3, color='g', ax=ax)
    ax.set_title(title, fontsize=18)
# Weekly moving-average (trend) plots for transactions and sales.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(30,20))
plot_moving_average(transactions, 'date', 'W', 'transactions', 7, 4, axes[0], 'Transactions Moving Average')
plot_moving_average(train, 'date', 'W', 'sales', 7, 4, axes[1], 'Sales Moving Average')
plt.show()